# Setup: clear the workspace, load libraries and read the vehicles data.
# NOTE(review): `rm(list = ls())` and a hard-coded `setwd()` make this script
# non-portable; kept for compatibility with the original workflow, but prefer
# project-relative paths in shared code.
rm(list = ls())
library(ggplot2)
library(GGally)
library(dplyr)
library(factoextra)
library(plotly)
library(cluster)
setwd("C:/Users/alejo/OneDrive/Documentos/UC3M/Año 2/Primer Cuatri/Statistical Learning/Assignment 1")
# Data set link: https://www.fueleconomy.gov/feg/epadata/vehicles.csv
# Downloaded from the official U.S. government fuel-economy site. The target is
# to find what this data set contains that lets us recognize which types of
# vehicles are better.
data <- read.csv(file = "vehicles.csv")
We need to understand the meaning of the columns in the data. In order to do so, we can follow the provided guide here https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle.
MPG means miles per gallon.
1 mile = 1.60934 kilometers and 1 gallon = 3.78541 liters. Each city variable measures the consumption in a city with different parameters.
fuelType1 - fuel type 1. For single fuel vehicles, this will be the only fuel. For dual fuel vehicles, this will be the conventional fuel.
fuelType2 - fuel type 2. For dual fuel vehicles, this will be the alternative fuel (e.g. E85, Electricity, CNG, LPG). For single fuel vehicles, this field is not used
co2 variables represent the grams/mile in the tailpipe for the different types of fuel.
comb variables represent the combined MPG consumption for each type of fuel.
cityE and combE represent the consumption of electricity for electric cars in kw-hrs/100 miles.
cylinders - engine cylinders displ - engine displacement in liters drive - drive axle type
After this, we start with the emissions list.
Some variables are related to the engine description and we have many missing values on them. Others refer to the fuel cost, something very important for our study in order to know which vehicles are more affordable.
As before with city, highway variables represent the mean consumption in MPG of each of the models.
hlv, hpv, lv2 and lv4 variables are related to the luggage of the vehicle.
Id, make, model allow us to identify the vehicles.
Range variables are used to express the range of fuelType2 vehicles such as electric ones.
The variable youSaveSpend is very interesting because it represents the money that you gain or lose over 5 years in comparison to an average car; we will see positive and negative values on it.
The variables createdOn and modifiedOn seem to be useless, since all the data was created at the same time and all of their values are equal.
And finally, phev variables are related to hybrid vehicles in cities, highways and combined.
# Collapse the raw data (one row per vehicle listing) to one row per model by
# averaging every column, so repeated listings of the same model become a
# single observation. Row names carry the model name.
models_names <- unique(makers$model)
nModels <- length(models_names)
# Pre-allocate the result instead of growing it row by row (the original also
# compared `makers$model == 1` for the first model, which should have been
# `models_names[1]`).
car_model_def <- as.data.frame(
  matrix(NA_real_, nrow = nModels, ncol = ncol(data),
         dimnames = list(models_names, colnames(data)))
)
for (i in seq_len(nModels)) {
  # which() drops NA matches, unlike direct logical indexing.
  car_model <- data[which(makers$model == models_names[i]), ]
  car_model_def[i, ] <- colMeans(car_model)
}
data <- car_model_def  # model-level data, copied BEFORE the maker column is added
ncol_of_data <- ncol(data) + 1
# Attach the maker (brand) of each model: look up the first row in `makers`
# whose model matches the row name, and take its maker name (column 1).
for (i in seq_len(nrow(car_model_def))) {
  index2 <- which(makers$model == rownames(car_model_def)[i])[1]
  car_model_def[i, ncol_of_data] <- makers[index2, 1]
}
remove(car_model)
# The new column is created with an automatic name; rename it by position.
colnames(car_model_def)[ncol_of_data] <- "make"
#In this second for loop I am just adding the maker company name to the row of each of the models, so we can also compute the best car makers. Now I have done something very important: we had many different vehicle models (40,000 of them) with repeated vehicles, so I have decided to compute the mean of each model and keep only the mean results per model, reducing the data set to about 4,500 rows. This could lead to some problems related to the mean approximations in the variables that have been converted from categorical to numeric, but we will deal with these problems later.
In the following code, I am going to make an extra data set that will allow us to understand much better what we have. As I have transformed the 40,000-row dataset into about 4,000 rows by grouping the cars by model, now I am going to do the same by brand, so that we can classify each brand easily without noise, as an option. The final data is 133 obs and 40 variables, much smaller.
#Data Dimension Reduction up to Manufacturers
# Dimension reduction up to manufacturers: average all models of each maker
# into a single row, giving one observation per brand.
makers_small <- subset(car_model_def, select = c(0, 41))  # keep only the "make" column
maker_names <- unique(makers_small$make)
maker_names <- maker_names[!is.na(maker_names)]  # drop models with no maker
nMaker <- length(maker_names)
# Pre-allocate one averaged row per maker; row names carry the maker name.
# (The original hard-coded "Alfa Romeo" as the first maker and contained a
# no-op `i = i + 1` inside the for loop — both removed.)
car_maker_def <- as.data.frame(
  matrix(NA_real_, nrow = nMaker, ncol = ncol(data),
         dimnames = list(maker_names, colnames(data)))
)
for (i in seq_len(nMaker)) {
  car_maker <- data[which(makers_small$make == maker_names[i]), ]
  car_maker_def[i, ] <- colMeans(car_maker)
}
# Separate into two data sets: one per model and one per brand.
remove(car_maker)
ncol_of_data <- ncol(data) + 1
data_models <- data           # models data
data_makers <- car_maker_def  # makers data
# Remove intermediates that will not be used any more.
# NOTE(review): `data_numeric` is not created in the visible code; remove()
# errors if it does not exist — confirm it is defined elsewhere.
remove(data); remove(car_maker_def); remove(data_numeric)
# In the original this boxplot call was fused into the trailing comment and
# therefore never executed; restored on its own line.
boxplot(scale(data_models), las = 2, col = "darkgreen")
#If you have a very high variance there will be many differences between each model, PCA will take into account the width and the correlation between all the boxes so that the final PCA box is going to be very big. Uses all the variances and the correlations.
# Then, we need to decide whether to scale or not. Let´s see the barplot of the scaled data:
boxplot(scale(data_models), las=2, col="darkgreen")PHEV means Plug-In & Electric Vehicle, and taking into account that most of the vehicles in this data set are not hybrid, the value of combinedCD and HighwayCD for most of them will be 0, meaning that those only use fuel and we will see in any barplot the hybrid and electric ones as outliers, when they are not. This is the main reason why, for this data set, as we want to find the highest performance cars in relation with consumption and speed, no car models (rows) will be removed.
## [1] 2.5 1.0 1.0 1.0 1.0 1.0
## [1] 1 1 1 1 1 1
Let´s see how the Money that you Save or Spend is related to the fuel consumption:
We can see that the more fuel the car consumes, the more money you spend (the less money you save), as predicted. Now, we are going to check which type of fuel each of the original samples uses:
ggplot(data_original) + aes(x = fuelType) + geom_bar(aes(fill = fuelType)) + coord_polar("y", start = 0)We can see that 30 thousand vehicles are classified as Regular fuel, 13000 as Premium fuel and just a few (~1000) are electric or hybrid, that is the real proportion nowadays in the vehicles market.
# Boxplots of annual barrel consumption per maker, ordered by median, rendered
# interactively with plotly.
p <- ggplot(data_makers) +
  aes(y = barrels08,
      fill = reorder(rownames(data_makers), barrels08, median)) +
  geom_boxplot() +
  labs(fill = "Maker") +  # was "Home Team", a copy-paste leftover label
  theme(legend.position = "bottom")
ggplotly(p)
# Electric companies and the ones with hybrid models have the lowest annual
# barrel consumption.
We are going to go from dimension 40, which means that we have 40 variables, to a smaller dimension that we can manage.
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.2675 2.5999 2.0170 1.98046 1.54171 1.22367 1.1832
## Proportion of Variance 0.2669 0.1690 0.1017 0.09806 0.05942 0.03743 0.0350
## Cumulative Proportion 0.2669 0.4359 0.5376 0.63567 0.69509 0.73252 0.7675
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.03189 0.96068 0.93347 0.91280 0.87663 0.80707 0.7456
## Proportion of Variance 0.02662 0.02307 0.02178 0.02083 0.01921 0.01628 0.0139
## Cumulative Proportion 0.79414 0.81721 0.83900 0.85983 0.87904 0.89532 0.9092
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.63001 0.60881 0.5761 0.56811 0.5478 0.52400 0.4606
## Proportion of Variance 0.00992 0.00927 0.0083 0.00807 0.0075 0.00686 0.0053
## Cumulative Proportion 0.91914 0.92841 0.9367 0.94478 0.9523 0.95914 0.9645
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.43042 0.41772 0.40414 0.37891 0.35254 0.31830 0.29021
## Proportion of Variance 0.00463 0.00436 0.00408 0.00359 0.00311 0.00253 0.00211
## Cumulative Proportion 0.96908 0.97344 0.97752 0.98111 0.98422 0.98675 0.98886
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.26949 0.26699 0.2608 0.23246 0.19415 0.18354 0.17187
## Proportion of Variance 0.00182 0.00178 0.0017 0.00135 0.00094 0.00084 0.00074
## Cumulative Proportion 0.99067 0.99246 0.9942 0.99551 0.99645 0.99729 0.99803
## PC36 PC37 PC38 PC39 PC40
## Standard deviation 0.15868 0.13668 0.12336 0.11450 0.08127
## Proportion of Variance 0.00063 0.00047 0.00038 0.00033 0.00017
## Cumulative Proportion 0.99866 0.99913 0.99951 0.99983 1.00000
By scaling we are “missing information”, but for this data set, it is the only way to be able to understand the data and make PCA work. first row is the standard deviation for each component. In the first box there is the 26% of the information, the second 17%, in the third box 10%, the forth is 9.8%, fifth is 6% . In 5 dimensions we will be able to see 70% of the information.
After seeing the plot, I decide to keep the first 4 variables only. With the first 4 components I can explain 64% of the variability
#First component:
Now, I am going to plot the squared loading, the contribution of the variables to each of the components
fviz_contrib(pca, choice = "var", axes = 1) #This is the same but squared. The red line is the average. The red dashed line on the graph above indicates the expected average contribution
If the contribution of the variables were uniform, the expected value would be 1/length(variables) = 1/40 = 2.5%, and we can see that this is almost the case, because the average contribution is 2.3%.
The most important contributors to this first component are atvType, ghgScore and the fuel that each vehicle uses, which strongly supports classifying between electric and fuel cars.
## [1] "RAV4 Prime 4WD" "Model S Plaid (21in Wheels)"
## [3] "Spider Veloce 2000" "Model S Long Range Plus"
## [5] "Model X Long Range Plus" "Model S Long Range"
## [7] "Model X Performance (20in Wheels)" "Model S Performance (19in Wheels)"
## [9] "Model S Performance (21in Wheels)" "Model X Long Range"
## [1] "Toyota" "Tesla" "Alfa Romeo" "Tesla" "Tesla"
## [6] "Tesla" "Tesla" "Tesla" "Tesla" "Tesla"
These are the Top 10 car models for our model; as we can see, most of them are electric, which means that the electric cars have the best statistics in our first component.
## [1] "Karma" "Tesla" "Fisker" "Lucid"
## [5] "Rivian" "Kandi" "BYD" "Polestar"
## [9] "Azure Dynamics" "CODA Automotive"
In case we apply PCA to the brands instead of the models, the Top 6 ones are Karma, Tesla, Fisker, Lucid, Rivian and Polestar (the Luxury Volvo brand); all of them are top electric brands that create premium electric cars, so as we were seeing in the models case, Electric cars are Top in our PCA classification.
#Second, Third and Fourth components:
## [1] "P1" "Yukon XL K10 AWD"
## [3] "Yukon XL K10 4WD" "Savana 1500 2WD Conversion (cargo)"
## [5] "X5 xDrive45e" "Clarity Plug-in Hybrid"
## [7] "Tahoe C10 2WD" "Spider Veloce 2000"
## [9] "Model S Long Range Plus" "Tahoe K10 4WD"
## [1] "McLaren Automotive" "GMC" "GMC"
## [4] "GMC" "BMW" "Honda"
## [7] "Chevrolet" "Alfa Romeo" "Tesla"
## [10] "Chevrolet"
## [1] 12 10 13 9 14 32 13 23 23 13
## [1] "Model X Long Range Plus"
## [2] "Model S Long Range"
## [3] "Model 3 Performance AWD"
## [4] "Model S Performance (19in Wheels)"
## [5] "Model S Plaid (21in Wheels)"
## [6] "Model X Performance (20in Wheels)"
## [7] "Model 3 Long Range AWD"
## [8] "Model S Performance (21in Wheels)"
## [9] "Model X Long Range"
## [10] "Model 3 Long Range AWD Performance"
## [11] "Model 3 Long Range Performance AWD (18in)"
## [12] "Model 3 Long Range Performance AWD (20in)"
## [13] "Model X Performance (22in Wheels)"
## [14] "Model 3 Long Range Performance AWD (19in)"
## [15] "Model 3 Long Range"
## [16] "Model 3 Mid Range"
## [17] "Model 3 Long Range AWD"
## [18] "Model 3 Long Range AWD Performance"
## [19] "Model 3 Standard Range Plus RWD"
## [20] "Model S 100D"
## [21] "Model S P100D"
## [22] "Model S Standard Range"
## [23] "Model X 100D"
## [24] "Model X Standard Range"
## [25] "Model X P100D"
## [26] "Model Y Long Range AWD"
## [27] "Model Y Performance AWD"
## [28] "Model S 75D"
## [29] "4040"
## [30] "Model X 75D"
## [31] "Model S AWD - P100D"
## [32] "Model S AWD - 100D"
## [33] "Model X AWD - P100D"
## [34] "Model S AWD - 90D"
## [35] "Model X AWD - 100D"
## [36] "Model S (85 kW-hr battery pack)"
## [37] "Model Y Standard Range RWD"
## [38] "Model S AWD - 85D"
## [39] "Model S (90 kW-hr battery pack)"
## [40] "Model S AWD - P90D"
## [41] "Model S AWD - P85D"
## [42] "Model S (75 kW-hr battery pack)"
## [43] "Model S AWD - 70D"
## [44] "Model X AWD - 90D"
## [45] "Model S AWD (85 kW-hr battery pack)"
## [46] "Model S AWD - 75D"
## [47] "Model X AWD - P90D"
## [48] "Model X AWD - 75D"
## [49] "Model Y Performance AWD (21in Wheels)"
## [50] "Model X AWD - 60D"
## [1] "Tesla" "Tesla" "Tesla" "Tesla" "Tesla" "Tesla" "Tesla" "Tesla" "Tesla"
## [10] "Tesla"
## [1] 3 3 3 3 3 3 3 3 3 3
## [1] 65.50000 66.33333 66.00000 65.33333 65.00000 64.00000 67.33333 64.00000
## [9] 64.00000 67.00000
## [1] "Savana 1500 2WD Conversion (cargo)"
## [2] "Yukon XL K10 AWD"
## [3] "Tundra 2WD FFV"
## [4] "Transit T150 Wagon FFV"
## [5] "Transit T150 Wagon 4WD FFV"
## [6] "Range Rover FFV"
## [7] "Range Rover L FFV"
## [8] "Range Rover Sport FFV"
## [9] "F150 5.0L 4WD FFV GVWR>7599 LBS PAYLOAD PACKAGE"
## [10] "Transit T150 Wagon 2WD FFV"
## [1] "GMC" "GMC" "Toyota" "Ford" "Ford"
## [6] "Land Rover" "Land Rover" "Land Rover" "Ford" "Ford"
## [1] 259.0000 259.0000 257.0000 255.4000 255.0000 252.6667 252.6667 252.6667
## [9] 255.0000 254.0000
## [1] 65.50000 66.33333 66.00000 65.33333 65.00000 64.00000 67.33333 64.00000
## [9] 64.00000 67.00000
At least the first 50 components of the third component are all Electric and Tesla, more exactly, the premium, and high performance versions of Tesla; the ones with more range and power. We can interpret that this third component is classifying as the best cars, Tesla, as they are actually the most complete electric cars.
If we analyze this four components by doing PCA to the brands instead of the models, we can identify that Component 4 (of the PCA brands) contains most of the brands that produce super cars and high class ones, the most powerful cars. (Lamborghini, Ferrari, Mclaren, Koenigsegg, Maserati, Pagani, Porsche and some others). We are not going to use the makers PCA by the moment but it is used just for interpretation purposes.
## [1] "Karma" "Fisker" "Ram" "Ford" "GMC" "Bentley"
## [7] "Mercury" "Chevrolet" "Lincoln" "Hummer"
## [1] "Karma" "Fisker"
## [3] "Azure Dynamics" "Yugo"
## [5] "Panther Car Company Limited" "Daihatsu"
## [7] "Mcevoy Motors" "Bertone"
## [9] "Renault" "Dacia"
## [1] "McLaren Automotive" "Volvo" "Alfa Romeo"
## [4] "Bugatti" "Pagani" "Porsche"
## [7] "RUF Automobile" "Koenigsegg" "Karma"
## [10] "Genesis"
Contribution of variables to second, third and fourth component in the analysis of the models:
We can see that for Component 2, the most important variables are related to the cars that use more than one type of fuel and the fuel cost for each of them. Barrels is the most contributive variable.
The third component is fully related to the emissions of each vehicle in grams/mile and takes into account the year when the cars were manufactured. It captures emissions, which is why Tesla is top 1 in almost all of it; Tesla has the lowest emissions, as we can see.
Finally, the fourth component is related to the gasoline consumption in cities and highways, that is why the most powerful cars belong to this list, because these cars are the ones with higher consumption. It is the case of Bugatti,
## Which are the best brands? Now I am going to show the contribution of each of the car makers.
## [1] "Karma" "Tesla" "Fisker" "Lucid"
## [5] "Rivian" "Kandi" "BYD" "Polestar"
## [9] "Azure Dynamics" "CODA Automotive"
Again, Karma is the first one, followed by Tesla, Fisker, Lucid, Rivian and Polestar; electric companies.
Finally, let’s make a zoom to see the top-20 makers in contributions:
# Zoom to the top-20 makers by contribution to the first component,
# relabelling the bars with maker names ordered by decreasing contribution.
names_z1 <- rownames(data_makers)[order(get_pca_ind(pca_makers)$contrib[, 1],
                                        decreasing = TRUE)]
fviz_contrib(pca_makers, choice = "ind", axes = 1, top = 20) +
  scale_x_discrete(labels = names_z1)
# This is the previous graphic, zoomed so the top brands are visible.
Now that we know which are the best brands, let's find which are the best models in our data set (in terms of the third PCA component, the emissions). The third component is going to be the most important for me because it speaks about emissions and money saved/spent, the target of this analysis.
## [1] "Model X Long Range Plus" "Model S Long Range"
## [3] "Model 3 Performance AWD" "Model S Performance (19in Wheels)"
## [5] "Model S Plaid (21in Wheels)" "Model X Performance (20in Wheels)"
## [7] "Model 3 Long Range AWD" "Model S Performance (21in Wheels)"
## [9] "Model X Long Range" "Model 3 Long Range AWD Performance"
Again, Karma is the first one, followed by Tesla, Fisker, Lucid, Rivian and Polestar; electric companies.
Finally, let’s make a zoom to see the top-20 makers in contributions:
# Zoom to the top contributors to the third component (emissions),
# relabelling the bars with model names ordered by decreasing contribution.
names_z1 <- rownames(data_models)[order(get_pca_ind(pca)$contrib[, 3],
                                        decreasing = TRUE)]
fviz_contrib(pca, choice = "ind", axes = 3, top = 30) +
  scale_x_discrete(labels = names_z1)
# As shown before, Model X, Model S and Model 3 are the most valuable cars in
# this data set in terms of consumption and performance (acceleration), since
# electric drivetrains provide the fastest acceleration.
# Scatter of PC1 (sign-flipped, interpreted as "Cost") vs PC3, coloured by the
# money saved or spent over 5 years relative to an average car.
data.frame(z1 = -pca$x[, 1], z2 = pca$x[, 3]) %>%
  ggplot(aes(z1, z2, label = rownames(data_models),
             color = data_models$youSaveSpend)) +
  geom_point(size = 0) +
  labs(title = "PCA in terms of Money Saving", x = "Cost", y = "FuelType") +
  theme_bw() +
  scale_color_gradient(low = "orange", high = "darkred") +
  theme(legend.position = "bottom") +
  geom_text(size = 2, hjust = 0.6, vjust = 0, check_overlap = TRUE)
# Several groups are visible: high-performance Teslas at the lower right,
# hybrid cars at the middle/top right, and conventional fuel cars at the top
# left.
The same but with components 3 and 4, but using colors for the money you Save or Spend played:
# Scatter of PC4 (sign-flipped, fuel consumption) vs PC3, coloured by the
# number of barrels of fuel used (barrels08).
data.frame(z1 = -pca$x[, 4], z2 = pca$x[, 3]) %>%
  ggplot(aes(z1, z2, label = rownames(data_models),
             color = data_models$barrels08)) +
  geom_point(size = 0) +
  labs(title = "PCA", x = "PC4, fuel consumption", y = "FuelType") +
  theme_bw() +
  scale_color_gradient(low = "yellow", high = "darkred") +
  theme(legend.position = "bottom") +
  geom_text(size = 2, hjust = 0.6, vjust = 0, check_overlap = TRUE)
# Models plotted by fuel type and consumption, gradient = barrels of fuel
# used: fuel cars in dark red in the middle, hybrids at the top left, and pure
# electric ones at the middle-low side.
##
## Call:
## factanal(x = data_models, factors = 2, scores = "regression", rotation = "none")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.180 0.528 0.267 0.999 0.934
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.912 0.879 0.995 0.931 0.852
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.883 0.660 0.424 0.894 0.082
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.480 0.054 0.935 0.356 0.999
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.363 0.375 0.325 0.615 0.527
## UHighway UHighwayA year youSaveSpend sCharger
## 0.519 0.105 0.842 0.371 0.990
## atvType fuelType2 rangeA evMotor mfrCode
## 0.511 0.217 0.636 0.595 0.786
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.548 0.644 0.591 0.839 0.361
##
## Loadings:
## Factor1 Factor2
## barrels08 -0.613 -0.666
## barrelsA08 0.570 -0.383
## charge240 0.556 0.651
## co2
## co2A 0.165 -0.196
## co2TailpipeAGpm 0.153 -0.254
## combinedCD 0.323 -0.127
## drive
## eng_dscr 0.241 -0.103
## fuelCostA08 0.249 -0.293
## fuelType -0.192 -0.284
## fuelType1 -0.196 -0.549
## ghgScore 0.586 0.482
## ghgScoreA 0.247 -0.212
## highway08 0.403 0.870
## highway08U 0.443 0.569
## highwayA08U 0.900 -0.369
## highwayCD 0.239
## highwayE 0.788 0.154
## mpgData
## phevBlended 0.750 -0.273
## rangeHwy 0.275 0.741
## rangeHwyA 0.777 -0.265
## trany -0.402 -0.472
## UCity 0.408 0.554
## UHighway 0.364 0.590
## UHighwayA 0.862 -0.391
## year 0.321 0.235
## youSaveSpend 0.497 0.618
## sCharger
## atvType 0.690 0.109
## fuelType2 0.788 -0.402
## rangeA 0.511 -0.322
## evMotor 0.600 0.212
## mfrCode 0.307 0.346
## c240Dscr 0.228 0.632
## charge240b 0.201 0.562
## c240bDscr 0.216 0.602
## startStop 0.358 0.180
## phevComb 0.761 -0.247
##
## Factor1 Factor2
## SS loadings 9.072 6.923
## Proportion Var 0.227 0.173
## Cumulative Var 0.227 0.400
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 167253.6 on 701 degrees of freedom.
## The p-value is 0
##
## Call:
## factanal(x = data_models, factors = 3, scores = "regression", rotation = "none")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.177 0.132 0.255 0.996 0.586
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.082 0.857 0.993 0.933 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.729 0.649 0.428 0.631 0.079
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.488 0.039 0.921 0.322 0.999
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.252 0.346 0.237 0.614 0.527
## UHighway UHighwayA year youSaveSpend sCharger
## 0.525 0.095 0.841 0.380 0.988
## atvType fuelType2 rangeA evMotor mfrCode
## 0.429 0.129 0.334 0.576 0.789
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.526 0.625 0.570 0.838 0.275
##
## Loadings:
## Factor1 Factor2 Factor3
## barrels08 -0.603 -0.675
## barrelsA08 0.326 -0.115 0.865
## charge240 0.529 0.681
## co2
## co2A 0.641
## co2TailpipeAGpm -0.164 0.942
## combinedCD 0.352 -0.136
## drive
## eng_dscr 0.208 0.140
## fuelCostA08 0.994
## fuelType -0.416 -0.308
## fuelType1 -0.204 -0.537 0.142
## ghgScore 0.579 0.486
## ghgScoreA 0.605
## highway08 0.377 0.877 -0.102
## highway08U 0.433 0.565
## highwayA08U 0.874 -0.296 0.330
## highwayCD 0.263
## highwayE 0.804 0.168
## mpgData
## phevBlended 0.812 -0.291
## rangeHwy 0.234 0.773
## rangeHwyA 0.826 -0.271
## trany -0.376 -0.495
## UCity 0.430 0.516 -0.148
## UHighway 0.379 0.557 -0.146
## UHighwayA 0.770 -0.257 0.495
## year 0.277 0.274
## youSaveSpend 0.492 0.610
## sCharger
## atvType 0.554 0.261 0.442
## fuelType2 0.645 -0.221 0.637
## rangeA 0.293 0.757
## evMotor 0.615 0.215
## mfrCode 0.294 0.353
## c240Dscr 0.193 0.660
## charge240b 0.169 0.588
## c240bDscr 0.183 0.629
## startStop 0.361 0.177
## phevComb 0.809 -0.255
##
## Factor1 Factor2 Factor3
## SS loadings 8.090 6.538 5.173
## Proportion Var 0.202 0.163 0.129
## Cumulative Var 0.202 0.366 0.495
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 133536.7 on 663 degrees of freedom.
## The p-value is 0
##
## Call:
## factanal(x = data_models, factors = 4, scores = "regression", rotation = "none")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.221 0.132 0.097 0.485 0.579
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.083 0.857 0.989 0.489 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.711 0.627 0.044 0.622 0.155
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.115 0.044 0.921 0.281 0.977
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.249 0.149 0.229 0.548 0.516
## UHighway UHighwayA year youSaveSpend sCharger
## 0.487 0.098 0.352 0.473 0.987
## atvType fuelType2 rangeA evMotor mfrCode
## 0.440 0.124 0.334 0.580 0.460
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.346 0.471 0.394 0.263 0.272
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## barrels08 -0.747 0.314 0.345
## barrelsA08 0.257 0.863 0.238
## charge240 0.684 -0.370 -0.546
## co2 0.173 -0.168 0.673
## co2A 0.641
## co2TailpipeAGpm -0.118 0.943 -0.119
## combinedCD 0.267 0.266
## drive
## eng_dscr 0.333 0.138 0.617
## fuelCostA08 0.994
## fuelType -0.218 -0.308 0.365 0.115
## fuelType1 -0.368 0.143 0.392 0.252
## ghgScore 0.855 -0.325 0.345
## ghgScoreA 0.604
## highway08 0.630 -0.104 -0.598 -0.285
## highway08U 0.741 -0.443 0.367
## highwayA08U 0.681 0.325 0.620
## highwayCD 0.203 0.194
## highwayE 0.757 0.205 -0.318
## mpgData -0.135
## phevBlended 0.624 0.596
## rangeHwy 0.454 -0.584 -0.549
## rangeHwyA 0.643 0.584
## trany -0.579 0.342
## UCity 0.599 -0.151 -0.286 0.140
## UHighway 0.578 -0.149 -0.352 0.181
## UHighwayA 0.596 0.490 0.550
## year 0.526 -0.281 0.534
## youSaveSpend 0.648 -0.310
## sCharger
## atvType 0.602 0.439
## fuelType2 0.501 0.633 0.467
## rangeA 0.238 0.755 0.196
## evMotor 0.642
## mfrCode 0.541 -0.326 0.374
## c240Dscr 0.377 -0.505 -0.506
## charge240b 0.332 -0.452 -0.463
## c240bDscr 0.358 -0.484 -0.493
## startStop 0.578 -0.167 0.613
## phevComb 0.629 0.566
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 9.847 5.153 5.076 3.720
## Proportion Var 0.246 0.129 0.127 0.093
## Cumulative Var 0.246 0.375 0.502 0.595
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 104335.8 on 626 degrees of freedom.
## The p-value is 0
## Factor1 Factor2 Factor3 Factor4
## barrels08 -0.74724529 0.055457460 0.314129642 0.34489978 0.22092475
## barrelsA08 0.25690828 0.863205960 0.237700928 -0.02874246 0.13154602
## charge240 0.68415740 -0.010118614 -0.369629396 -0.54595001 0.09713923
## co2 0.17324759 0.059937148 -0.167971380 0.67285619 0.48538759
## co2A -0.01451493 0.641390340 -0.058332975 0.07807517 0.57892342
## co2TailpipeAGpm -0.11756453 0.942964641 -0.118606062 0.00403125 0.08291616
## combinedCD 0.26711012 0.021518625 0.265836319 -0.02964843 0.85654644
## drive -0.07251047 0.044905204 0.024270240 -0.05385881 0.98938637
## eng_dscr 0.33263709 0.138025088 0.004908852 0.61720344 0.48931232
## fuelCostA08 -0.05312042 0.994169626 -0.062025103 0.00476042 0.00500000
## fuelType -0.21774482 -0.307764407 0.365159391 0.11489238 0.71119145
## fuelType1 -0.36756290 0.143281618 0.392323871 0.25182460 0.62718078
## ghgScore 0.85479228 -0.031185790 -0.325286869 0.34474003 0.04370075
## ghgScoreA 0.06215069 0.604263254 0.008310670 0.09503612 0.62201510
## highway08 0.62966845 -0.103818857 -0.597504661 -0.28457954 0.15473295
## highway08U 0.74074344 -0.072712610 -0.443053377 0.36733764 0.11477779
## highwayA08U 0.68088191 0.324811017 0.619838343 -0.05595217 0.04356796
## highwayCD 0.20261197 0.014792746 0.194401979 -0.02420869 0.92058421
## highwayE 0.75663837 0.054826582 0.204609993 -0.31805687 0.28145644
## mpgData -0.05165414 0.012201106 0.045926823 -0.13541126 0.97709808
## phevBlended 0.62351991 0.056216541 0.595534490 -0.06574078 0.24909395
## rangeHwy 0.45375171 -0.043613339 -0.584482683 -0.54884975 0.14934886
## rangeHwyA 0.64319275 0.084762115 0.583553129 -0.09653837 0.22924255
## trany -0.57917062 0.007714867 0.341589087 -0.01236204 0.54769145
## UCity 0.59943567 -0.150583442 -0.286421049 0.13989587 0.51635293
## UHighway 0.57791524 -0.148723373 -0.351574588 0.18143197 0.48740981
## UHighwayA 0.59570742 0.490377642 0.550432715 -0.05927862 0.09817130
## year 0.52646659 0.085591037 -0.281300480 0.53353544 0.35169729
## youSaveSpend 0.64758424 -0.077102259 -0.309914231 -0.07431114 0.47313025
## sCharger 0.06099273 0.006393293 0.080647756 0.05094256 0.98728570
## atvType 0.60211905 0.438635272 -0.005042106 -0.07446114 0.43950068
## fuelType2 0.50136615 0.633189412 0.466687979 -0.07356101 0.12449422
## rangeA 0.23798224 0.755326707 0.196153496 -0.02859456 0.33357383
## evMotor 0.64227246 -0.003364065 0.055758279 -0.06841326 0.57972677
## mfrCode 0.54140859 -0.025642206 -0.326042180 0.37404588 0.46001453
## c240Dscr 0.37705110 -0.034941162 -0.505176744 -0.50572151 0.34561048
## charge240b 0.33155962 -0.030416610 -0.451664154 -0.46315977 0.47061271
## c240bDscr 0.35776946 -0.032896977 -0.483945964 -0.49307846 0.39362485
## startStop 0.57775638 -0.020009849 -0.166638632 0.61263707 0.26268847
## phevComb 0.62927989 0.072123651 0.566294620 -0.07711165 0.27216023
I have decided to choose 4 factors because the total variability explained by them is around 60% whether with 3 factors is just 50% and with 2 is just 40%. The total variability explained by the analysis is quite similar to the one in PCA in a practical way, but not in a scientific point of view.
# Loadings of the first two factors, stacked vertically.
# NOTE(review): las = 1 vs las = 2 below gives inconsistent axis-label
# orientation between the two panels — confirm whether that was intended.
par(mfrow = c(2, 1))
barplot(x.f$loadings[, 1], las = 1, col = "darkgreen")
barplot(x.f$loadings[, 2], las = 2, col = "darkgreen")
# Without rotation and with regression scores we obtain the previous results.
# Next we check whether varimax rotation and Bartlett estimation improve them.
##
## Call:
## factanal(x = data_models, factors = 2, scores = "Bartlett", rotation = "varimax")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.180 0.528 0.267 0.999 0.934
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.912 0.879 0.995 0.931 0.852
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.883 0.660 0.424 0.894 0.082
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.480 0.054 0.935 0.356 0.999
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.363 0.375 0.325 0.615 0.527
## UHighway UHighwayA year youSaveSpend sCharger
## 0.519 0.105 0.842 0.371 0.990
## atvType fuelType2 rangeA evMotor mfrCode
## 0.511 0.217 0.636 0.595 0.786
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.548 0.644 0.591 0.839 0.361
##
## Loadings:
## Factor1 Factor2
## barrels08 -0.890 -0.165
## barrelsA08 0.686
## charge240 0.847 0.125
## co2
## co2A 0.244
## co2TailpipeAGpm -0.134 0.265
## combinedCD 0.341
## drive
## eng_dscr 0.259
## fuelCostA08 -0.116 0.367
## fuelType -0.342
## fuelType1 -0.569 0.126
## ghgScore 0.720 0.240
## ghgScoreA 0.322
## highway08 0.951 -0.121
## highway08U 0.718
## highwayA08U 0.166 0.958
## highwayCD 0.251
## highwayE 0.549 0.585
## mpgData
## phevBlended 0.167 0.780
## rangeHwy 0.774 -0.161
## rangeHwyA 0.189 0.799
## trany -0.614
## UCity 0.686
## UHighway 0.693
## UHighwayA 0.127 0.938
## year 0.370 0.147
## youSaveSpend 0.788
## sCharger
## atvType 0.460 0.527
## fuelType2 0.881
## rangeA 0.604
## evMotor 0.499 0.395
## mfrCode 0.457
## c240Dscr 0.657 -0.143
## charge240b 0.583 -0.129
## c240bDscr 0.625 -0.137
## startStop 0.343 0.207
## phevComb 0.195 0.775
##
## Factor1 Factor2
## SS loadings 9.030 6.966
## Proportion Var 0.226 0.174
## Cumulative Var 0.226 0.400
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 167253.6 on 701 degrees of freedom.
## The p-value is 0
##
## Call:
## factanal(x = data_models, factors = 3, scores = "Bartlett", rotation = "varimax")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.177 0.132 0.255 0.996 0.586
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.082 0.857 0.993 0.933 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.729 0.649 0.428 0.631 0.079
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.488 0.039 0.921 0.322 0.999
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.252 0.346 0.237 0.614 0.527
## UHighway UHighwayA year youSaveSpend sCharger
## 0.525 0.095 0.841 0.380 0.988
## atvType fuelType2 rangeA evMotor mfrCode
## 0.429 0.129 0.334 0.576 0.789
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.526 0.625 0.570 0.838 0.275
##
## Loadings:
## Factor1 Factor2 Factor3
## barrels08 -0.872 -0.248
## barrelsA08 0.435 0.824
## charge240 0.843 0.184
## co2
## co2A 0.643
## co2TailpipeAGpm 0.953
## combinedCD 0.376
## drive
## eng_dscr 0.230 0.117
## fuelCostA08 0.996
## fuelType -0.384 -0.339
## fuelType1 -0.579 0.109
## ghgScore 0.690 0.309
## ghgScoreA 0.115 0.596
## highway08 0.957
## highway08U 0.699 0.140
## highwayA08U 0.947 0.234
## highwayCD 0.280
## highwayE 0.497 0.657
## mpgData
## phevBlended 0.860
## rangeHwy 0.799 -0.124
## rangeHwyA 0.110 0.867
## trany -0.608 -0.126
## UCity 0.658 0.150 -0.134
## UHighway 0.672 -0.125
## UHighwayA 0.855 0.409
## year 0.362 0.141
## youSaveSpend 0.766 0.174
## sCharger 0.103
## atvType 0.454 0.434 0.420
## fuelType2 0.742 0.563
## rangeA 0.381 0.722
## evMotor 0.460 0.460
## mfrCode 0.446 0.111
## c240Dscr 0.679 -0.112
## charge240b 0.604 -0.102
## c240bDscr 0.647 -0.108
## startStop 0.317 0.246
## phevComb 0.117 0.843
##
## Factor1 Factor2 Factor3
## SS loadings 8.720 6.294 4.787
## Proportion Var 0.218 0.157 0.120
## Cumulative Var 0.218 0.375 0.495
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 133536.7 on 663 degrees of freedom.
## The p-value is 0
##
## Call:
## factanal(x = data_models, factors = 4, scores = "Bartlett", rotation = "varimax")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.221 0.132 0.097 0.485 0.579
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.083 0.857 0.989 0.489 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.711 0.627 0.044 0.622 0.155
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.115 0.044 0.921 0.281 0.977
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.249 0.149 0.229 0.548 0.516
## UHighway UHighwayA year youSaveSpend sCharger
## 0.487 0.098 0.352 0.473 0.987
## atvType fuelType2 rangeA evMotor mfrCode
## 0.440 0.124 0.334 0.580 0.460
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.346 0.471 0.394 0.263 0.272
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## barrels08 -0.799 -0.307 -0.215
## barrelsA08 0.411 0.836
## charge240 0.915 0.254
## co2 -0.208 0.673
## co2A 0.646
## co2TailpipeAGpm 0.949
## combinedCD 0.378
## drive
## eng_dscr -0.195 0.154 0.651 0.160
## fuelCostA08 0.995
## fuelType -0.390 -0.119 -0.340
## fuelType1 -0.588 -0.123 0.110
## ghgScore 0.453 0.282 0.819
## ghgScoreA 0.604
## highway08 0.869 0.295
## highway08U 0.449 0.114 0.819
## highwayA08U 0.937 0.265
## highwayCD 0.281
## highwayE 0.475 0.698
## mpgData -0.149
## phevBlended 0.863
## rangeHwy 0.920
## rangeHwyA 0.873
## trany -0.510 -0.132 -0.417
## UCity 0.418 0.159 0.520 -0.113
## UHighway 0.421 0.563 -0.103
## UHighwayA 0.842 0.436
## year 0.132 0.779 0.134
## youSaveSpend 0.582 0.207 0.378
## sCharger
## atvType 0.355 0.438 0.225 0.438
## fuelType2 0.730 0.585
## rangeA 0.360 0.733
## evMotor 0.356 0.476 0.259
## mfrCode 0.266 0.681
## c240Dscr 0.806
## charge240b 0.724
## c240bDscr 0.775
## startStop 0.180 0.838
## phevComb 0.848
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 7.375 6.240 5.263 4.918
## Proportion Var 0.184 0.156 0.132 0.123
## Cumulative Var 0.184 0.340 0.472 0.595
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 104335.8 on 626 degrees of freedom.
## The p-value is 0
## Factor1 Factor2 Factor3 Factor4
## barrels08 -0.798628056 -0.307125499 -0.21451635 0.0305651177 0.22092475
## barrelsA08 -0.021329536 0.410744660 -0.01965631 0.8360034430 0.13154602
## charge240 0.914750451 0.254011941 0.03747585 0.0128895855 0.09713923
## co2 -0.208423509 -0.091272164 0.67295556 0.0995834965 0.48538759
## co2A -0.046932892 -0.016304969 0.03365100 0.6461344923 0.57892342
## co2TailpipeAGpm -0.035956118 -0.097525625 -0.07741696 0.9488360794 0.08291616
## combinedCD 0.003888822 0.377575540 0.02766935 -0.0038526127 0.85654644
## drive -0.023654285 -0.020132829 -0.09055165 0.0399950493 0.98938637
## eng_dscr -0.194796688 0.154120402 0.65078136 0.1595221869 0.48931232
## fuelCostA08 -0.037175224 -0.009674788 -0.06490492 0.9946740929 0.00500000
## fuelType -0.389736624 0.085728610 -0.11855292 -0.3396682921 0.71119145
## fuelType1 -0.587583602 0.020800324 -0.12340761 0.1097687332 0.62718078
## ghgScore 0.453134277 0.282334825 0.81909900 0.0182437869 0.04370075
## ghgScoreA -0.053367082 0.078554057 0.06754995 0.6037515537 0.62201510
## highway08 0.869166778 0.010095494 0.29536593 -0.0496407743 0.15473295
## highway08U 0.449477916 0.114372973 0.81852107 -0.0115275309 0.11477779
## highwayA08U 0.021045614 0.937030827 0.08795185 0.2650044703 0.04356796
## highwayCD 0.008331399 0.281210712 0.02203155 -0.0037892295 0.92058421
## highwayE 0.475462598 0.697833025 0.06818808 0.0291165654 0.28145644
## mpgData 0.024618400 0.017847172 -0.14943279 0.0027843969 0.97709808
## phevBlended 0.021597021 0.863038115 0.07496079 -0.0004684885 0.24909395
## rangeHwy 0.920363726 -0.058460161 -0.01268112 -0.0010019590 0.14934886
## rangeHwyA 0.056438418 0.873396059 0.06271641 0.0282156567 0.22924255
## trany -0.509647053 -0.132291808 -0.41734351 -0.0302853575 0.54769145
## UCity 0.418293193 0.159260970 0.52012881 -0.1128677439 0.51635293
## UHighway 0.420523995 0.091962536 0.56270753 -0.1033905864 0.48740981
## UHighwayA 0.010941289 0.841980070 0.05347376 0.4357959358 0.09817130
## year 0.131949416 0.080311898 0.77875009 0.1340543502 0.35169729
## youSaveSpend 0.582370501 0.207221775 0.37792329 -0.0442358948 0.47313025
## sCharger -0.045264213 0.093420132 0.04564786 0.0005431575 0.98728570
## atvType 0.355048368 0.437560911 0.22531562 0.4384428297 0.43950068
## fuelType2 0.011842034 0.730122453 0.01156636 0.5849384020 0.12449422
## rangeA -0.002449215 0.359941244 -0.00936958 0.7326638641 0.33357383
## evMotor 0.355994004 0.476122347 0.25854595 -0.0066603496 0.57972677
## mfrCode 0.266001097 0.070209698 0.68102570 0.0226412795 0.46001453
## c240Dscr 0.805766091 -0.056299702 -0.04378380 0.0007315990 0.34561048
## charge240b 0.723902314 -0.052530234 -0.05078688 0.0010702188 0.47061271
## c240bDscr 0.775126649 -0.055052828 -0.05057856 0.0009647229 0.39362485
## startStop 0.049303375 0.180464865 0.83777514 0.0207333179 0.26268847
## phevComb 0.048133846 0.848071061 0.07726202 0.0178882169 0.27216023
For our data set, again the variability explained by 2, 3 and 4 factors is the same as before.
The model does not explain very well the data due to the high number of variables and the complexity they present.
We are going to see how many factors we could need to see at least 75% of the variability explained:
##
## Call:
## factanal(x = data_models, factors = 5, scores = "Bartlett", rotation = "varimax")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.186 0.130 0.058 0.231 0.573
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.082 0.855 0.978 0.408 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.585 0.557 0.088 0.621 0.073
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.149 0.048 0.919 0.266 0.848
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.248 0.123 0.223 0.493 0.047
## UHighway UHighwayA year youSaveSpend sCharger
## 0.066 0.101 0.242 0.053 0.964
## atvType fuelType2 rangeA evMotor mfrCode
## 0.433 0.122 0.333 0.574 0.389
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.348 0.479 0.401 0.154 0.269
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5
## barrels08 -0.710 -0.320 -0.454
## barrelsA08 0.409 0.836
## charge240 0.913 0.286 0.163
## co2 -0.118 0.858
## co2A 0.643
## co2TailpipeAGpm 0.946 -0.114
## combinedCD 0.378
## drive -0.135
## eng_dscr -0.171 0.147 0.146 0.714
## fuelCostA08 0.994
## fuelType -0.468 -0.322 -0.265 0.133
## fuelType1 -0.627 0.122 -0.187
## ghgScore 0.388 0.286 0.657 0.498
## ghgScoreA 0.604
## highway08 0.758 0.589
## highway08U 0.347 0.111 0.579 0.619
## highwayA08U 0.932 0.269
## highwayCD 0.282
## highwayE 0.450 0.714 0.137
## mpgData -0.305 0.232
## phevBlended 0.862
## rangeHwy 0.929 0.112
## rangeHwyA 0.877
## trany -0.526 -0.151 -0.423 -0.169
## UCity 0.179 0.127 0.115 0.941
## UHighway 0.193 0.170 0.927
## UHighwayA 0.835 0.441
## year 0.153 0.118 0.819 0.209
## youSaveSpend 0.359 0.186 0.885
## sCharger 0.124 -0.106
## atvType 0.304 0.443 0.441 0.161 0.241
## fuelType2 0.730 0.587
## rangeA 0.359 0.733
## evMotor 0.308 0.484 0.200 0.239
## mfrCode 0.275 0.692 0.224
## c240Dscr 0.804
## charge240b 0.719
## c240bDscr 0.771
## startStop 0.177 0.861 0.266
## phevComb 0.848 0.104
##
## Factor1 Factor2 Factor3 Factor4 Factor5
## SS loadings 6.438 6.249 4.879 4.453 4.257
## Proportion Var 0.161 0.156 0.122 0.111 0.106
## Cumulative Var 0.161 0.317 0.439 0.550 0.657
##
## Test of the hypothesis that 5 factors are sufficient.
## The chi square statistic is 76675.99 on 590 degrees of freedom.
## The p-value is 0
##
## Call:
## factanal(x = data_models, factors = 7, scores = "Bartlett", rotation = "varimax")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.135 0.131 0.054 0.242 0.572
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.079 0.855 0.969 0.404 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.005 0.126 0.078 0.609 0.068
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.144 0.036 0.920 0.226 0.846
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.253 0.124 0.229 0.471 0.044
## UHighway UHighwayA year youSaveSpend sCharger
## 0.056 0.090 0.243 0.028 0.961
## atvType fuelType2 rangeA evMotor mfrCode
## 0.418 0.127 0.332 0.565 0.376
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.069 0.190 0.051 0.163 0.275
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7
## barrels08 -0.323 -0.462 -0.529 0.319 -0.402
## barrelsA08 0.402 0.839
## charge240 0.288 0.700 0.251 -0.268 0.488
## co2 -0.106 -0.113 0.845
## co2A 0.644
## co2TailpipeAGpm -0.108 0.946 -0.103
## combinedCD 0.377
## drive -0.128
## eng_dscr 0.147 -0.151 0.147 0.714 -0.129
## fuelCostA08 0.994
## fuelType -0.235 -0.283 -0.220 0.897
## fuelType1 -0.360 0.161 -0.151 0.817 -0.132
## ghgScore 0.285 0.249 0.503 0.693 0.195
## ghgScoreA 0.605 -0.105
## highway08 0.547 0.660 -0.265 0.337
## highway08U 0.112 0.209 0.622 0.618 -0.106 0.138
## highwayA08U 0.935 0.275
## highwayCD 0.280
## highwayE 0.714 0.251 0.186 -0.203 0.350
## mpgData 0.235 -0.293
## phevBlended 0.859
## rangeHwy 0.817 0.197 -0.211 0.352
## rangeHwyA 0.868
## trany -0.151 -0.336 -0.206 -0.436 0.207 -0.343
## UCity 0.129 0.104 0.946 0.156
## UHighway 0.108 0.933 0.209
## UHighwayA 0.837 0.447
## year 0.109 0.192 0.826 -0.112
## youSaveSpend 0.181 0.248 0.918 0.180
## sCharger -0.110 0.113
## atvType 0.440 0.172 0.432 0.274 0.165 -0.189 0.187
## fuelType2 0.722 0.592
## rangeA 0.351 0.736
## evMotor 0.480 0.183 0.260 0.211 0.223
## mfrCode 0.227 0.217 0.715
## c240Dscr 0.954 0.121
## charge240b 0.890
## c240bDscr 0.963 0.105
## startStop 0.177 0.234 0.866
## phevComb 0.841 0.101
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7
## SS loadings 6.198 5.004 4.890 4.622 4.592 1.972 1.152
## Proportion Var 0.155 0.125 0.122 0.116 0.115 0.049 0.029
## Cumulative Var 0.155 0.280 0.402 0.518 0.633 0.682 0.711
##
## Test of the hypothesis that 7 factors are sufficient.
## The chi square statistic is 54907.25 on 521 degrees of freedom.
## The p-value is 0
##
## Call:
## factanal(x = data_models, factors = 10, scores = "Bartlett", rotation = "varimax")
##
## Uniquenesses:
## barrels08 barrelsA08 charge240 co2 co2A
## 0.140 0.048 0.027 0.208 0.560
## co2TailpipeAGpm combinedCD drive eng_dscr fuelCostA08
## 0.016 0.755 0.957 0.401 0.005
## fuelType fuelType1 ghgScore ghgScoreA highway08
## 0.005 0.124 0.089 0.450 0.072
## highway08U highwayA08U highwayCD highwayE mpgData
## 0.153 0.030 0.866 0.195 0.838
## phevBlended rangeHwy rangeHwyA trany UCity
## 0.022 0.111 0.107 0.468 0.049
## UHighway UHighwayA year youSaveSpend sCharger
## 0.047 0.054 0.241 0.029 0.959
## atvType fuelType2 rangeA evMotor mfrCode
## 0.005 0.120 0.254 0.196 0.378
## c240Dscr charge240b c240bDscr startStop phevComb
## 0.070 0.189 0.048 0.119 0.050
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7 Factor8
## barrels08 -0.457 -0.543 -0.277 -0.117 0.311 -0.216
## barrelsA08 0.849 0.199 0.423
## charge240 0.696 0.270 0.217 0.125 -0.264 0.142
## co2 -0.105 0.860 -0.121
## co2A 0.649 0.100
## co2TailpipeAGpm 0.957
## combinedCD 0.116 0.475
## drive -0.117 -0.107
## eng_dscr -0.149 0.138 0.722 0.106
## fuelCostA08 0.992
## fuelType -0.229 -0.278 -0.213 0.900
## fuelType1 -0.351 0.164 -0.144 -0.114 0.811 -0.129
## ghgScore 0.246 0.694 0.507 0.202 0.146 0.130
## ghgScoreA 0.593 0.104
## highway08 0.543 0.675 -0.247 0.104
## highway08U 0.207 0.613 0.627
## highwayA08U 0.266 0.106 0.741 0.499 0.118
## highwayCD 0.100 0.347
## highwayE 0.245 0.209 0.430 0.521 -0.187 0.205
## mpgData -0.296 0.231
## phevBlended 0.494 0.841 0.103
## rangeHwy 0.814 0.210 -0.206
## rangeHwyA 0.875 0.312
## trany -0.329 -0.437 -0.209 -0.127 0.206 -0.201
## UCity 0.102 0.165 0.939
## UHighway 0.107 0.218 0.937
## UHighwayA 0.432 0.684 0.402 0.114
## year 0.102 0.829 0.192 -0.113
## youSaveSpend 0.245 0.916 0.149 0.152
## sCharger 0.113 -0.107 0.105
## atvType 0.155 0.415 0.143 0.243 0.277 0.207 -0.165 0.756
## fuelType2 0.590 0.576 0.434
## rangeA 0.724 0.463
## evMotor 0.167 0.189 0.232 0.228 0.341 0.710
## mfrCode 0.225 0.713 0.216
## c240Dscr 0.952 0.121
## charge240b 0.889
## c240bDscr 0.964 0.103
## startStop 0.874 0.214 0.106 0.227
## phevComb 0.104 0.924 0.247
## Factor9 Factor10
## barrels08 -0.331
## barrelsA08
## charge240 0.511
## co2
## co2A
## co2TailpipeAGpm -0.230
## combinedCD
## drive
## eng_dscr -0.121
## fuelCostA08
## fuelType
## fuelType1 -0.124
## ghgScore 0.164
## ghgScoreA 0.426
## highway08 0.303
## highway08U
## highwayA08U 0.261
## highwayCD
## highwayE 0.396
## mpgData
## phevBlended
## rangeHwy 0.360
## rangeHwyA
## trany -0.300
## UCity
## UHighway
## UHighwayA 0.324
## year
## youSaveSpend 0.136
## sCharger
## atvType
## fuelType2
## rangeA
## evMotor 0.104
## mfrCode
## c240Dscr
## charge240b
## c240bDscr
## startStop
## phevComb -0.125
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6 Factor7 Factor8
## SS loadings 4.949 4.866 4.650 4.645 4.075 2.524 1.927 1.464
## Proportion Var 0.124 0.122 0.116 0.116 0.102 0.063 0.048 0.037
## Cumulative Var 0.124 0.245 0.362 0.478 0.580 0.643 0.691 0.727
## Factor9 Factor10
## SS loadings 0.985 0.463
## Proportion Var 0.025 0.012
## Cumulative Var 0.752 0.764
##
## Test of the hypothesis that 10 factors are sufficient.
## The chi square statistic is 31785.23 on 425 degrees of freedom.
## The p-value is 0
As we can see, in order to explain at least 75% of the variability of the data, we would need 10 different factors, which is not affordable due to the complexity to understand each of them.
As in PCA, when performing clustering the groups and the number assigned to each of them can change between runs, so the annotations might not exactly match the cluster numbers. By setting a seed we avoid this kind of problem.
Let’s start creating 5 clusters (the typical number of different positions) using k-means
# Seed fixed only so the write-up matches the output; remove this in order to
# make a real clustering trial.
set.seed(9)

# Standardize the variables so no single scale dominates the distances.
data_models_SC <- scale(data_models)

# K-means with K = 5 centers; nstart = 100 restarts and keeps the best fit.
fit <- kmeans(data_models_SC, centers = 5, nstart = 100)

# Here I am assigning each car model to one of the groups.
groups <- fit$cluster
Let's see how many models we have in each of the clusters:
The first and the last clusters seem to have the majority of them
First, center´s meaning:
## barrels08 barrelsA08 charge240 co2 co2A co2TailpipeAGpm
## 1 0.38662550 3.7626363 -0.2037590 0.4082014 3.1485089 4.3495137
## 2 -2.21603021 3.0690640 1.9950287 -0.2729710 -0.1462308 -0.2148291
## 3 0.26977303 -0.2225664 -0.2037590 -0.6987158 -0.1393952 -0.1658897
## 4 -4.50474801 -0.2669753 5.2597928 -0.7724202 -0.1462308 -0.2148291
## 5 0.04622149 -0.2389680 -0.2032223 1.0182542 -0.1159731 -0.1961982
## combinedCD drive eng_dscr fuelCostA08 fuelType fuelType1
## 1 -0.05858332 0.225532275 0.5718643 4.5625079 -1.49773863 0.66200844
## 2 2.62442159 -0.153410684 1.4097594 0.1261668 0.34352388 -0.05948843
## 3 -0.05858332 0.083757941 -0.6272799 -0.1875849 0.29813851 0.20864531
## 4 -0.05858332 0.001752704 -1.0068433 -0.2390490 -2.52474714 -3.65892509
## 5 -0.05745680 -0.133466593 0.8210605 -0.2057984 -0.09184497 -0.08443218
## ghgScore ghgScoreA highway08 highway08U highwayA08U highwayCD
## 1 -0.2489816 3.0393538 -0.42658284 -0.3360163 1.0446783 -0.04384815
## 2 2.3821329 -0.1540123 0.29359966 1.1434047 6.1619320 1.96413698
## 3 -0.7280664 -0.1471330 -0.30545165 -0.6762057 -0.1951048 -0.04384815
## 4 2.5814711 -0.1540123 5.01488524 2.5260372 -0.2010105 -0.04384815
## 5 0.7227389 -0.0924807 0.08030791 0.7340469 -0.1735915 -0.04299481
## highwayE mpgData phevBlended rangeHwy rangeHwyA trany
## 1 -0.2106666 0.03662301 -0.1318200 -0.1453055 -0.1311230 0.1310903
## 2 5.0333485 -0.04269988 5.9178854 -0.1453055 5.8978286 -1.2160484
## 3 -0.2105098 0.19332233 -0.1318200 -0.1453055 -0.1311230 0.4746647
## 4 2.8058613 -0.18623065 -0.1318200 4.9453301 -0.1311230 -3.1230355
## 5 -0.1819274 -0.26046762 -0.1300094 -0.1451681 -0.1299677 -0.3783104
## UCity UHighway UHighwayA year youSaveSpend sCharger
## 1 -0.7201335 -0.6557946 1.8415789 0.3498367 -0.4051138 -0.01466338
## 2 1.2434745 0.7660015 5.4898088 1.0478171 1.5167692 0.62575390
## 3 -0.3716346 -0.3981625 -0.2071720 -0.7561242 -0.2921955 -0.04651189
## 4 1.9050051 1.9657748 -0.2293923 0.9687277 2.9123035 -0.20257025
## 5 0.3855048 0.4390401 -0.1998924 0.8978773 0.1480873 0.04662419
## atvType fuelType2 rangeA evMotor mfrCode c240Dscr charge240b
## 1 1.7787656 2.4605687 3.3137950 -0.3053130 -0.1516983 -0.114641 -0.09933160
## 2 3.1764181 5.1552018 2.4337156 3.4702798 0.9652521 -0.114641 -0.09822765
## 3 -0.3677891 -0.2192076 -0.1940682 -0.3007774 -0.6579559 -0.114641 -0.09933160
## 4 1.8151585 -0.2468859 -0.2290324 1.9185158 1.5653070 3.903126 3.38105592
## 5 0.0119541 -0.2276790 -0.1982005 0.1125824 0.7717314 -0.114641 -0.09933160
## c240bDscr startStop phevComb
## 1 -0.1076115 -0.1764950 -0.1246706
## 2 -0.1076115 1.6723000 5.6140147
## 3 -0.1076115 -0.7918979 -0.1246706
## 4 3.6637963 0.4129425 -0.1246706
## 5 -0.1076115 1.0101444 -0.1239406
As in PCA, we need to understand which is each of the groups
# Plot the center of cluster 3 (the biggest group) in green against the
# global per-variable median (0.5 quantile) in red. The most representative
# variables for this huge group are fuelCost, fuelType, barrels and range.
i <- 3
bar1 <- barplot(centers[i, ], las = 2, col = "darkgreen",
                main = paste("Cluster", i,
                             ": Group center in green, global center in red"))
points(bar1, y = apply(data_models_SC, 2, quantile, 0.5),
       col = "red", pch = 19)
# Same plot for cluster 4. (The narrative called this "the smallest group,
# number 2" — note the mismatch with i = 4; confirm against the knitted
# report.) Here the most representative variables are year, startStop,
# emissions and the given score.
i <- 4
bar2 <- barplot(centers[i, ], las = 2, col = "darkgreen",
                main = paste("Cluster", i,
                             ": Group center in green, global center in red"))
points(bar2, y = apply(data_models_SC, 2, quantile, 0.50),
       col = "red", pch = 19)
This is a really representative way of seeing each of the clusters. As we have set K to 5 we have 5 different groups, and it is quite easy to guess what are the characteristics of them.
# Cluster plot (k = 5) labelled with the model names (row names).
fviz_cluster(fit, data = data_models_SC, geom = c("point"),
             ellipse.type = "norm", pointsize = 1) +
  theme_minimal() +
  geom_text(label = rownames(data_models), hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")

# Same plot labelled with the makers, to understand each group better.
# The size of each ellipsoid reflects how far apart its models are: the
# smaller the ellipsoid, the closer and more similar the models.
fviz_cluster(fit, data = data_models_SC, geom = c("point"),
             ellipse.type = "norm", pointsize = 1) +
  theme_minimal() +
  geom_text(label = car_model_def$make, hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")
In the provided cluster (I remark that I am using a seed so that I always have the same output and I can explain it easier) there are 5 groups.
Group number 1 in red color is composed mainly by Ford and Chevrolet, and most of the models are hybrid ones.
Group number 2 is the smallest one. It is located in the lower-left side and is composed of most of the high-performance hybrid cars and some of the top electric ones such as Karma or Riviera. These models belong to Porsche, BMW, Mercedes and Volvo, which means they are usually high-power vehicles with a small electric motor that allows them to be seen as less contaminating.
Group number 3 is the biggest one, and it is the one with the smallest size in the plot, which means all its vehicles are quite similar. This is the group of the fuel cars, all with similar consumptions and emissions.
Group number 4 is the Top group. It is composed by Tesla, Nissan, and some others and to it belongs all the Electric-High-Power vehicles.
Finally, group number 5 is quite similar to group 3, is the second one with more cars and 90% of them are fuel cars.
“The silhouette measures the cohesion of a data point to its cluster relative to other clusters (separation). Silhouette plots rely on a distance metric and suggest that the data matches its own cluster well. The larger the silhouette widths, the better.” (This definition is taken directly from class practices)
# Silhouette widths for the 5-cluster solution: cohesion of each point to
# its own cluster versus the nearest other cluster. Larger widths = better.
d <- dist(data_models_SC, method = "euclidean")
sil <- silhouette(groups, d)
plot(sil, col = 1:5, main = "", border = NA)
## Silhouette of 4521 units in 5 clusters from silhouette.default(x = groups, dist = d) :
## Cluster sizes and average silhouette widths:
## 180 98 2409 129 1705
## 0.1986460 0.2720201 0.4071593 0.2823921 0.2009797
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2378 0.2119 0.3333 0.3146 0.4448 0.5502
The larger the line, the better, we are using this graph to select the number of groups that we use, the larger are the lines while we are adding them, the better the selection groups is.
We have analyzed the data by separating it into 5 groups, but could we know the real number of groups? There is no definitive answer to this, but we can estimate which K is most optimal in terms of efficiency and cost.
# Seed again so I always get the same output and the written answers
# coincide. In real practice just remove it!
set.seed(1)
fit <- kmeans(data_models_SC, centers = 3, nstart = 100)
groups <- fit$cluster

# Cluster sizes; plotting the biggest group.
barplot(table(groups), col = "darkgreen")

centers <- fit$centers

# Plotting the centers of cluster 1 (green) vs global median (red).
i <- 1
bar1 <- barplot(centers[i, ], las = 2, col = "darkgreen",
                main = paste("Cluster", i,
                             ": Group center in green, global center in red"))
points(bar1, y = apply(data_models_SC, 2, quantile, 0.50),
       col = "red", pch = 19)

# Plotting the centers of cluster 2; with this seed the second group is the
# one with the good models.
i <- 2
bar2 <- barplot(centers[i, ], las = 2, col = "darkgreen",
                main = paste("Cluster", i,
                             ": Group center in green, global center in red"))
points(bar2, y = apply(data_models_SC, 2, quantile, 0.50),
       col = "red", pch = 19)
# Clusplot follows.
# Cluster plot (k = 3) labelled with the model names (row names).
fviz_cluster(fit, data = data_models_SC, geom = c("point"),
             ellipse.type = "norm", pointsize = 1) +
  theme_minimal() +
  geom_text(label = rownames(data_models), hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")

# Same plot labelled with the makers, to understand each group better.
# The output is roughly similar to before: electric cars in group 3, fuel
# cars in group 2 and hybrids in the remaining group. Easier to handle,
# but not as exact as with 5 centers.
fviz_cluster(fit, data = data_models_SC, geom = c("point"),
             ellipse.type = "norm", pointsize = 1) +
  theme_minimal() +
  geom_text(label = car_model_def$make, hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")
# Silhouette widths for the 3-cluster solution.
d <- dist(data_models_SC, method = "euclidean")
sil <- silhouette(groups, d)
# NOTE: the original used col = 1:2 with 3 clusters, recycling colors;
# one color per cluster is intended, hence 1:3.
plot(sil, col = 1:3, main = "", border = NA)
## Silhouette of 4521 units in 3 clusters from silhouette.default(x = groups, dist = d) :
## Cluster sizes and average silhouette widths:
## 248 4143 130
## -0.01286599 0.68127560 0.31054170
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3425 0.6417 0.6999 0.6325 0.7265 0.7617
The negative side represents the worst performance vehicles on each group. From a statistical point of view it is better to select a smaller number of groups.
In terms of Silhouette plot, it seems 3 clusters is better than 5 because the average width is higher, but despite we know what each group is composed by, it is difficult to guess the best value for K.
Based on wss (total within sum of square), we can guess which could be the best values for K.
WSS is the minimum sum of squares, which are the distances between the models, the closer the models, the better. It is another choice.
This plot returns the k-means with 1, 2,…10 centers, then the silhouette plot and then the average of the width for all of them. Here we can see that the answer is K = 4.
Based on the gap statistic (using bootstrap)
# As it is time-consuming I have run this code just once to check the output
# fviz_nbclust(data_models_SC, kmeans, method = 'gap_stat', k.max = 15)In this plot we compute the so called gap statistic, but IT IS TIME CONSUMER. We could select 3 or 4 because 11 clusters is too much. As the final number of groups is subjective, we cannot answer clearly all of them. We keep more than 2 and less than 5. 4 clusters for this case. (Despite that the Gap statistic recommends K = 11)
As we can already understand what each of the variables means, and I have not kept any profile variable, we are not going to check any difference from their use. Let's try other types of clustering:
K-means only uses the euclidean distance and we cannot change it.
(x - mean of x)/s = (S2)-1/2 * (x - mean of x). This new second matrix is what we use in order to incorporate all the information about correlations. Thanks to the code below, we can use a better k-means, the one by computing Mahalanobis distances.
# %*% is matrix multiplication in R.
# Mahalanobis whitening: (x - mean) %*% S^(-1/2) incorporates the
# correlation structure, so plain k-means on the transformed data is
# equivalent to using Mahalanobis distances.
S_x <- cov(data_models)
iS <- solve(S_x)  # solve() with one argument gives the matrix inverse

# Square root of iS via its spectral decomposition:
# iS^(1/2) = V diag(sqrt(lambda)) V'. This acts as the "standard deviation".
e <- eigen(iS)
V <- e$vectors
B <- V %*% diag(sqrt(e$values)) %*% t(V)

# Centering without dividing by S (scale = FALSE): only the numerator of
# standardization.
Xtil <- scale(data_models, scale = FALSE)
model_S <- Xtil %*% B
# Take care: this assumes all clusters share the same covariance matrix,
# i.e. the same shape and orientation as the whole data. If not, mixture
# models (probabilistic clusters) are needed.
If this is not the case, we need mixture models (probabilistic clusters)
kmeans with 3 clusters assuming an elliptic distribution
# k-means on the whitened data (i.e. Mahalanobis distances), 4 clusters.
fit.mahalanobis <- kmeans(model_S, centers = 4, nstart = 100)
groups <- fit.mahalanobis$cluster
centers <- fit.mahalanobis$centers
# Restore readable variable names lost in the whitening transform.
colnames(centers) <- colnames(data_models_SC)
centers
## barrels08 barrelsA08 charge240 co2 co2A co2TailpipeAGpm
## 1 -2.4904392 0.28034169 0.23405702 -0.92032790 0.02960461 0.132161131
## 2 0.1022667 0.01109465 0.01194225 0.01631830 -0.01366634 -0.004532019
## 3 -4.5912731 0.01982986 0.08460714 -0.66812195 0.26056957 0.111980473
## 4 0.3507377 -0.24766067 -0.28909914 0.07465527 0.14124544 0.024003102
## combinedCD drive eng_dscr fuelCostA08 fuelType fuelType1
## 1 0.020214866 -6.531169e-02 -1.72518054 -0.04172696 -0.134563475 -0.19949171
## 2 -0.001025359 -2.187297e-05 0.01280824 -0.01408970 0.003576829 -0.02492978
## 3 -0.011246000 -8.184453e-02 -1.06840227 0.39343729 -0.562781640 -0.26110055
## 4 0.022931929 4.357963e-02 0.39764102 0.09557597 0.199271556 0.61534403
## ghgScore ghgScoreA highway08 highway08U highwayA08U highwayCD
## 1 0.039432393 -0.20007222 2.364509905 2.25706767 -0.40746317 -0.009511554
## 2 0.003956334 -0.01671383 0.004347652 -0.07760876 0.05688317 0.001214573
## 3 -0.350900378 0.36684729 0.539807891 1.28254588 -0.80443529 0.070807490
## 4 0.079803613 0.17254240 -0.545512646 0.70000759 -0.68904049 -0.054593961
## highwayE mpgData phevBlended rangeHwy rangeHwyA trany
## 1 -0.66353731 0.1088441780 -0.182536207 5.82399784 0.074243510 -0.39340594
## 2 0.02108989 0.0002596127 0.004252623 0.01502291 0.023172089 0.01657078
## 3 1.82390337 -0.0828097394 0.019838016 -0.89786950 -0.982978749 -0.33733313
## 4 -1.17218253 0.0226378957 -0.073881721 -0.41489327 -0.005146771 -0.12875246
## UCity UHighway UHighwayA year youSaveSpend sCharger
## 1 1.0279791 0.97933912 -0.31467974 0.15647976 0.2808307209 0.01559382
## 2 -0.0884755 -0.01231199 0.02952345 -0.01703224 -0.0001943665 -0.00207266
## 3 0.6948003 0.72784162 -0.31311889 0.11570277 0.0654275991 -0.05824204
## 4 1.2882416 -0.18408135 -0.39556773 0.25999415 -0.0517464102 0.06479859
## atvType fuelType2 rangeA evMotor mfrCode c240Dscr
## 1 -0.12371655 0.073624973 -0.087340370 0.3787990 0.993917877 -2.09895347
## 2 -0.10803147 -0.004777171 -0.003779981 -0.1652373 0.006384619 0.06826115
## 3 0.01356294 -0.163291563 -0.086933228 0.3719890 -0.590575924 -2.33267422
## 4 2.07860216 0.159087388 0.120039623 2.9676607 0.054474373 -0.05821474
## charge240b c240bDscr startStop phevComb
## 1 11.39316094 2.00885116 0.02467939 -0.35178378
## 2 -0.01948732 0.02102996 -0.01212590 0.02267156
## 3 -1.42490731 -1.25604038 -0.12662599 -0.36948177
## 4 -0.02413963 -0.01751542 0.28796853 -0.23506166
# Plot one cluster center (green bars) against the global per-variable
# median (red points). Factored into a helper: the original repeated this
# code verbatim for clusters 1-4.
plot_cluster_center <- function(i) {
  bars <- barplot(centers[i, ], las = 2, col = "darkgreen",
                  main = paste("Cluster", i,
                               ": Group center in green, global center in red"))
  points(bars, y = apply(data_models_SC, 2, quantile, 0.50),
         col = "red", pch = 19)
}

# Plotting the centers of clusters 1 through 4.
for (i in seq_len(nrow(centers))) {
  plot_cluster_center(i)
}

# Clusplot for the Mahalanobis fit. Different insights now...
fviz_cluster(fit.mahalanobis, data = data_models, geom = c("point"),
             ellipse.type = "norm", pointsize = 1) +
  theme_minimal() +
  geom_text(label = rownames(data_models), hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")
The Adjusted Rand index compares two classifications: the closer to 1, the more agreement
## [1] 0.1911303
Here we have advanced tools, and one of them, from the mclust library, compares two different clusterings and how related they are: 0 means quite different, 1 the opposite. For this case the output is 0.19, which suggests that the clusters are quite different. As we could see in the previous plot, this is true.
Group number 1 in red color is composed by the Top cars, just Tesla brand.
Group number 2 is the previous number 2 and the number 4 together. Hybrid and low consume fuel vehicles.
Group number 3 is a new group, and it relates most of the electric vehicles that are not Tesla and are behind them, but that represent a higher performance than fuel cars does.
Group number 4 is the fuel group. Contains same fuel cars from last clustering and represents the ones with more emissions.
Although Mahalanobis is more sophisticated, I do not find this result preferable. The simpler, original one was easier to interpret and made more sense. Here, as the distance used is different, the result is more complex and the logic behind it is harder to interpret.
Partitioning (clustering) of the data into k clusters around medoids
More robust version than k-means, the centers are now models
# PAM (k-medoids): a more robust version of k-means where the centers are
# actual models from the data. stand = TRUE standardizes the variables.
fit.pam <- eclust(data_models, "pam", stand = TRUE, k = 4, graph = FALSE)

fviz_cluster(fit.pam, data = data_models, geom = c("point"), pointsize = 1) +
  theme_minimal() +
  geom_text(label = rownames(data_models), hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")
# The output is quite similar to the k-means clustering, which is good
# news: it suggests our result is robust.
Check the number of each of the groups by using PAM:
For the silhouette method we obtain that K = 5 is the optimal number, better than the K = 8 provided by Mahalanobis before. It was a bit time-consuming but optimal; nevertheless, the gap statistic, as before, is EXTREMELY time-consuming considering it also performs bootstrapping. My PC has 12 cores and it took almost an hour to finish this process. I am going to comment it out so that you do not need to run it if you do not want to.
Now, as before we are going to compare the similarity of this 2 clusters by computing the adjusted Rand index comparing two classifications. Again, 0 means totally different and 1 totally equal.
## [1] 0.218582
The returned number, 0.21, means that they are not similar, as happened before.
Try to capture clusters that are not linearly separable in the input space
# Kernel k-means: tries to capture clusters that are not linearly separable
# in the input space.
library(kernlab)
# kkmeans requires a matrix, so the data frame is coerced first.
fit.ker <- kkmeans(as.matrix(data_models), centers = 5,
                   kernel = "rbfdot")  # Radial Basis kernel (Gaussian)
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 178.7154 4.844199 8.493221 142.06503 1.055427 1.071696 1.059817 5.879185
## [2,] 255.2294 8.226586 1.000000 87.64549 4.558567 17.666872 1.000000 6.421855
## [3,] 252.0382 5.337158 1.103641 137.07834 4.645965 7.691640 1.012605 6.125749
## [4,] 247.3670 4.427302 1.066194 91.69005 2.738292 5.539806 1.000000 6.068481
## [5,] 233.9985 4.334730 1.536220 126.90646 1.768358 2.646673 1.013117 5.670345
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 298.6498 1.145683 8.684257 4.799908 7.799232 1.035447 35.00732 3957.67696
## [2,] 146.3781 3.813816 9.009213 5.384978 1.334277 1.073857 8.68301 74.75341
## [3,] 184.0418 2.420149 9.572598 5.449131 2.079573 1.132829 13.08981 346.60149
## [4,] 162.4336 2.248068 9.624700 5.395270 1.995927 1.080676 15.58587 451.01763
## [5,] 234.0160 1.844753 9.801994 5.403649 3.284671 1.097058 19.66623 1231.72648
## [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 45.29633 1.042430 19.654899 2.233551 1.061606 13.75561 6.779879 20.42419
## [2,] 3.85281 1.000000 1.000000 2.131644 1.000000 1.00000 1.000000 29.62805
## [3,] 10.54720 1.011204 1.922549 2.200541 1.005882 1.00000 1.171989 29.84911
## [4,] 9.00173 1.000000 1.683512 2.298259 1.002786 1.00000 1.217431 30.43725
## [5,] 23.89165 1.004284 6.011942 2.247647 1.017918 1.00000 2.896755 29.19233
## [,25] [,26] [,27] [,28] [,29] [,30] [,31] [,32]
## [1,] 7100.9782 7310.6409 46.093096 34.79413 45.35824 1.029650 3.640667 1.151911
## [2,] 656.6584 683.6113 8.671852 18.95260 13.17431 1.046470 1.644305 1.106952
## [3,] 2138.9194 2291.2250 16.064995 22.16372 19.94134 1.044992 1.451508 1.060322
## [4,] 3351.2516 3476.6734 17.017181 20.64532 24.77916 1.029367 1.521197 1.051143
## [5,] 5244.8909 5196.9372 27.786567 24.85244 33.08623 1.027084 1.928350 1.089476
## [,33] [,34] [,35] [,36] [,37] [,38] [,39] [,40]
## [1,] 6.803259 45.319873 26.594821 1.377571 1.603686 1.319425 2.446313 3.113351
## [2,] 10.027943 2.226376 6.262037 1.000000 1.000000 1.000000 1.243497 1.000000
## [3,] 7.422252 3.964190 9.329845 1.000000 1.000000 1.000000 1.533408 1.021289
## [4,] 5.045240 4.143229 6.912586 1.000000 1.000000 1.000000 1.437609 1.022977
## [5,] 5.415849 14.670129 13.492790 1.000000 1.000000 1.000000 1.792422 1.353838
## [1] 764 963 714 913 1167
## [1] 170178774170 2366538250 14008254733 41391334685 126157569829
# fviz_cluster() needs a list with the data and the cluster assignment;
# kkmeans() returns an S4 object, so the labels are pulled from the @.Data slot.
object.ker <- list(data = data_models, cluster = fit.ker@.Data)
fviz_cluster(object.ker, geom = c("point"), ellipse = FALSE, pointsize = 1) +
  theme_minimal() +
  geom_text(label = rownames(data_models), hjust = 0, vjust = 0,
            size = 2, check_overlap = TRUE) +
  scale_fill_brewer(palette = "Paired")
This kernel method (rbfdot, i.e. the Gaussian RBF kernel) has returned a different output than before. We have used 5 centers and it has mixed some of the data. All the electric cars are combined with some of the fuel ones and the hybrids from the lower-left side, and the other groups are quite difficult to interpret; group number 2 represents some of the Ford and Chevrolet hybrid models. It seems that the groups are classified mainly according to the second dimension.
Important to decide distance between observations and linkage to join groups
First of all, as stated, we are going to choose a proper distance and a cluster.
# Euclidean distance between the (scaled) models.
d <- dist(data_models_SC, method = "euclidean")
# Ward linkage (ward.D2) on that distance: agglomerative hierarchical clustering.
hc <- hclust(d, method = "ward.D2")
# We need to decide the distance between two models and between two brands.
The importance of a dendrogram is that you can decide where to cut it so that the output is robust and consistent. This process is also somewhat time-consuming, as it requires a lot of calculations and edges. We are going to use a phylogenic tree to be able to visualize all the models:
# library(igraph)
#
# fviz_dend(x = hc,
# k = 5,
# color_labels_by_k = TRUE,
# cex = 0.8,
# type = "phylogenic",
# repel = TRUE) + labs(title="Car Models on the US") +
# theme(axis.text.x = element_blank(), axis.text.y = element_blank())
We have here a well-organized dendrogram, which takes a lot of time to compute.
This method is quite similar to k-means but instead, it computes probabilities based on distributions.
The target is maximizing the “overall likelihood” of the data.
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEV (ellipsoidal, equal volume and shape) model with 5 components:
##
## log-likelihood n df BIC ICL
## 129830.2 4521 4144 224782.4 224779.2
##
## Clustering table:
## 1 2 3 4 5
## 437 103 173 3523 285
# The clustering is probabilistic: for each car model we don't get a unique
# group but the probabilities that the model belongs to each of the groups.
head(res.Mclust$z) # K is chosen automatically using the BIC --> summary(res.Mclust)
## [,1] [,2] [,3] [,4] [,5]
## Spider Veloce 2000 0.000000e+00 0.000000e+00 1 0.000000e+00 0.000000e+00
## Testarossa 2.803842e-21 6.262712e-36 0 1.000000e+00 5.014010e-21
## Charger 5.162087e-08 1.101178e-238 0 6.883628e-166 9.999999e-01
## B150/B250 Wagon 2WD 3.648588e-22 1.903807e-33 0 1.000000e+00 1.624946e-20
## Legacy AWD Turbo 1.411878e-14 2.121185e-41 0 1.000000e+00 3.695874e-23
## Loyale 3.051487e-20 1.390176e-24 0 1.000000e+00 5.472374e-22
# Posterior membership probabilities (one row per model, one column per group).
# Use <- for assignment, per R convention.
pp <- res.Mclust$z
# The tool assigns each observation to the group with the highest probability.
head(res.Mclust$classification)
## Spider Veloce 2000 Testarossa Charger B150/B250 Wagon 2WD
## 3 4 5 4
## Legacy AWD Turbo Loyale
## 4 4
# BIC model-selection plot. Note the argument is spelled "palette": the
# original "pallete" was silently swallowed by the function's ... argument,
# so the jco palette was never actually applied.
fviz_mclust(object = res.Mclust, what = "BIC", palette = "jco") +
  scale_x_discrete(limits = c(1:10))
This is how K is selected: the number of clusters K is on the x axis and the BIC (the higher, the better) on the y axis. This model recommends K = 9 clusters — too many for practical purposes.
The grouping provided by this plot is really similar to that of PAM. Electric and hybrid vehicles have been joined together with some fuel cars, and then the fuel cars have been distributed into many different categories — again, hard to classify. So these clusters could also be good, but I still prefer the K-means output.
As before, comparison: We are going to check whether PAM and EM are or not similar
# Computes the adjusted Rand index comparing two classifications:
# the PAM partition versus the model-based (EM) partition.
# The closer to 1, the more agreement between the two clusterings.
adjustedRandIndex(res.Mclust$classification, fit.pam$clustering) # PAM vs probabilistic clustering
## [1] 0.2300684
A heat map is a false color image (based on data frame X) with a dendrogram added to the left side and to the top
#this is a way to visualize our data set using colors. It joins the observations and the variables using clustering.
# This is used to understand very well what is happening in a huge data set.
# Heat map of the scaled data with row/column dendrograms attached.
# Same choices as in the hierarchical clustering above:
# Euclidean distance and Ward (ward.D2) linkage.
euclid_dist <- function(x) dist(x, method = "euclidean")
ward_tree <- function(x) hclust(x, method = "ward.D2")
heatmap(data_models_SC, scale = "none", distfun = euclid_dist,
        hclustfun = ward_tree, cexRow = 0.7)
The target of this project was to organize the chosen data set, which is composed of cars, and to analyze the consumption of the vehicles in relation to the fuel type they use and the money you can save each month by using one car or another. On the data's website, https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle, they have a tool that automatically compares each car with other models, and a lot of information to play with. I have chosen this topic, cars, because I like this industry a lot and because nowadays there are many problems in the car industry due to pollution issues.
We have also been capable of classifying the cars into several groups. We have made 2, 3, 4 and 5 different categories, and the one I have liked the most was the simplest. K-means has provided us with a really good grouping from my point of view, and so has PAM: the outputs were different but both seemed right to me.
Mahalanobis and EM, despite being the theoretically best ones, turned out to be highly time-consuming, and I did not like their output much.
The conclusion we can draw from this data set is that electric cars seem to be really good in every sense, especially those from Tesla. They manufacture the best car models — that is, the ones with the highest youSaveSpend variable (which measures the money that you save or spend, forgive the redundancy), a higher range among electric models and the best acceleration performance.
Despite all of this pro-electric sentiment, further analyses that are not reachable with this data have suggested that some electric cars — due to battery life expectancy, the small particles that they emit into the atmosphere and the resources from which their electricity comes — pollute more than some eco-friendly gasoline/fuel cars.
So, regarding noise, particle emissions while driving and acceleration, electric vehicles are the best ones; but, as we could see in all the clusterings, the groups with hybrid versions — although they are more expensive due to the cost of having two engines — could also be a good solution for the moment.
If in the future battery technology improves, which it probably will, or if hydrogen vehicles start to be developed and engineers find a way to deal with hydrogen's problems, that will also be a valid solution.